Machine Learning Fundamentals

Author

Sneha Para

1 Libraries

# Import libraries
library(tidyverse)
#> ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
#> ✔ dplyr     1.1.4     ✔ readr     2.1.5
#> ✔ forcats   1.0.0     ✔ stringr   1.5.1
#> ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
#> ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
#> ✔ purrr     1.0.2     
#> ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
#> ✖ dplyr::filter() masks stats::filter()
#> ✖ dplyr::lag()    masks stats::lag()
#> ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyquant)
#> Loading required package: PerformanceAnalytics
#> Loading required package: xts
#> Loading required package: zoo
#> 
#> Attaching package: 'zoo'
#> 
#> The following objects are masked from 'package:base':
#> 
#>     as.Date, as.Date.numeric
#> 
#> 
#> ######################### Warning from 'xts' package ##########################
#> #                                                                             #
#> # The dplyr lag() function breaks how base R's lag() function is supposed to  #
#> # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
#> # source() into this session won't work correctly.                            #
#> #                                                                             #
#> # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
#> # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
#> # dplyr from breaking base R's lag() function.                                #
#> #                                                                             #
#> # Code in packages is not affected. It's protected by R's namespace mechanism #
#> # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
#> #                                                                             #
#> ###############################################################################
#> 
#> Attaching package: 'xts'
#> 
#> The following objects are masked from 'package:dplyr':
#> 
#>     first, last
#> 
#> 
#> Attaching package: 'PerformanceAnalytics'
#> 
#> The following object is masked from 'package:graphics':
#> 
#>     legend
#> 
#> Loading required package: quantmod
#> Loading required package: TTR
#> Registered S3 method overwritten by 'quantmod':
#>   method            from
#>   as.zoo.data.frame zoo
library(broom)
library(umap)

2 Data

# STOCK PRICES
sp_500_prices_tbl <- read_rds("F:/Sneha/ss23-bdml-SnehaPara/Dataset/sp_500_prices_tbl.rds")
sp_500_prices_tbl
# SECTOR INFORMATION
sp_500_index_tbl <- read_rds("F:/Sneha/ss23-bdml-SnehaPara/Dataset/sp_500_index_tbl.rds")
sp_500_index_tbl

2.1 Step 1 - Convert stock prices to a standardized format (daily returns)

sp_500_prices_tbl %>% glimpse()
#> Rows: 1,225,765
#> Columns: 8
#> $ symbol   <chr> "MSFT", "MSFT", "MSFT", "MSFT", "MSFT", "MSFT", "MSFT", "MSFT…
#> $ date     <date> 2009-01-02, 2009-01-05, 2009-01-06, 2009-01-07, 2009-01-08, …
#> $ open     <dbl> 19.53, 20.20, 20.75, 20.19, 19.63, 20.17, 19.71, 19.52, 19.53…
#> $ high     <dbl> 20.40, 20.67, 21.00, 20.29, 20.19, 20.30, 19.79, 19.99, 19.68…
#> $ low      <dbl> 19.37, 20.06, 20.61, 19.48, 19.55, 19.41, 19.30, 19.52, 19.01…
#> $ close    <dbl> 20.33, 20.52, 20.76, 19.51, 20.12, 19.52, 19.47, 19.82, 19.09…
#> $ volume   <dbl> 50084000, 61475200, 58083400, 72709900, 70255400, 49815300, 5…
#> $ adjusted <dbl> 15.86624, 16.01451, 16.20183, 15.22628, 15.70234, 15.23408, 1…
# Apply your data transformation skills!

sp_500_daily_returns_tbl <- sp_500_prices_tbl %>% 
    select(symbol, date, adjusted) %>%          
    filter(date >= "2018-01-01") %>%            
    group_by(symbol) %>% 
    mutate(adj_lag = lag(adjusted)) %>% 
    filter(!is.na(adj_lag)) %>% 
    mutate(diff = adjusted - adj_lag,
           pct_return = diff / adj_lag) %>% 
    select(symbol, date, pct_return)

sp_500_daily_returns_tbl

2.2 Step 2 - Convert to User-Item Format

sp_500_daily_returns_tbl <- read_rds("F:/Sneha/ss23-bdml-SnehaPara/Dataset/sp_500_daily_returns_tbl.rds")
sp_500_daily_returns_tbl
# Convert to User-Item Format
stock_date_matrix_tbl <- sp_500_daily_returns_tbl %>% 
    pivot_wider(names_from = date,
                values_from = pct_return,
                values_fill = list(pct_return = 0))
stock_date_matrix_tbl

2.3 Step 3 - Perform K-Means Clustering

# Create kmeans_obj for 4 centers
kmeans_obj <- stock_date_matrix_tbl %>% 
    select(-symbol) %>% 
    kmeans(centers = 4, nstart = 20)
kmeans_obj %>% glance()
# Apply glance() to get the tot.withinss

2.4 Step 4 - Find the optimal value of K

kmeans_mapper <- function(center = 3) {
    stock_date_matrix_tbl %>%
        select(-symbol) %>%
        kmeans(centers = center, nstart = 20)
}
# Use purrr to map
k_means_mapped_tbl <- tibble(centers = 1:30) %>% 
    mutate(k_means = centers %>% map(kmeans_mapper),
           glance = k_means %>% map(glance))


k_means_mapped_tbl 
# Visualize Scree Plot
k_means_mapped_tbl %>% 
    unnest(glance) %>% 
    ggplot(aes(x = centers, y = tot.withinss)) +
    geom_point() +
    geom_line()

2.5 Step 5 - Apply UMAP

k_means_mapped_tbl <- read_rds("F:/Sneha/ss23-bdml-SnehaPara/Dataset/k_means_mapped_tbl.rds")
# Apply UMAP
umap_results <- stock_date_matrix_tbl %>% 
    select(-symbol) %>% 
    umap() 
umap_results 
#> umap embedding of 502 items in 2 dimensions
#> object components: layout, data, knn, config
# Convert umap results to tibble with symbols
umap_results_tbl <- umap_results$layout %>%
    as_tibble() %>%
    bind_cols(stock_date_matrix_tbl %>% select(symbol))
#> Warning: The `x` argument of `as_tibble.matrix()` must have unique column names if
#> `.name_repair` is omitted as of tibble 2.0.0.
#> ℹ Using compatibility `.name_repair`.
 umap_results_tbl
# Visualize UMAP results
umap_results_tbl %>% 
    ggplot(aes(V1, V2)) +
    geom_point(alpha = 0.5) +
    theme_tq() +
    labs(title = "UMAP Projection")

2.6 Step 6 - Combine K-Means and UMAP

k_means_mapped_tbl <- read_rds("F:/Sneha/ss23-bdml-SnehaPara/Dataset/k_means_mapped_tbl.rds")
umap_results_tbl   <- read_rds("F:/Sneha/ss23-bdml-SnehaPara/Dataset/umap_results_tbl.rds")
# Get the k_means_obj from the 10th center
k_means_obj <- k_means_mapped_tbl %>% 
    filter(centers == 10) %>% 
    pull(k_means) %>% 
    pluck(1)

# Store as k_means_obj
# Use your dplyr & broom skills to combine the k_means_obj with the umap_results_tbl
umap_kmeans_results_tbl <- k_means_obj %>%
    augment(stock_date_matrix_tbl) %>%
    select(symbol, .cluster) %>%
    left_join(umap_results_tbl,
              by = "symbol") %>%
    left_join(sp_500_index_tbl %>% select(symbol, company, sector),
              by = "symbol")
 umap_kmeans_results_tbl 
# Visualize the combined K-Means and UMAP results
umap_kmeans_results_tbl %>% 
    ggplot(aes(V1, V2, color = .cluster)) +
    geom_point(alpha = 0.5) +
    theme_tq() +
    scale_color_tq()